1 Loading data and packages

library(readxl)
library(readr)
library(CTexploreR)
library(Vennerable)
library(biomaRt)
library(tidyverse)
library(SummarizedExperiment)
library(UpSetR)

Gene names/synonyms required for databases cleaning

ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id", "external_gene_name",
                       "external_synonym", "gene_biotype",
                       "chromosome_name", "band", "start_position", "end_position",
                       "strand")
ensembl_gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))

ensembl_gene_synonym <- ensembl_gene_synonym %>%
  mutate(external_synonym = sub(pattern = "ORF", external_synonym, 
                                replacement = "orf"))

attributes_vector <- c("ensembl_gene_id", "external_gene_name")
ensembl_gene_names <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))

attributes_vector <- c("external_gene_name",
                       "external_synonym")
gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
CT_and_CTP_genes <- CT_genes()
CT_genes <- (filter(CT_genes(), CT_gene_type == "CT_gene"))


upset_text_size <- c(2, 2, 2, 2, 3.5, 3.5)
 #c(intersection size title, intersection size tick labels, set size title, 
 # set size tick labels, set names, numbers above bars)

2 Database comparison

2.1 Lists cleaning

CT lists from other databases have been checked (using GTEx and our GTEx_expression() function and GeneCards) in order to remove duplicated gene names or deprecated ones and allow comparison between databases.

2.1.1 CTdatabase

Online list copied in a csv file, several lists exist so we combined them.

We checked gene names that were a concatenation of two genes (choice using biomaRt synonyms to get the official one), checked which ones had the right names, removed duplicated genes, verified lost genes and added back those that should be there.

CTdatabase <- read_delim("data/CTdatabase1.csv", delim = ";", 
                         escape_double = FALSE, trim_ws = TRUE)
colnames(CTdatabase) <- c("Family", "Gene_Name", "Chromosomal_localization",
                          "CT_identifier")
CTdatabase_bis <- read_csv2("data/CTdatabase2.csv")
CTdatabase <- left_join(CTdatabase, CTdatabase_bis, 
                        by = c("Gene_Name" = "Gene_Symbol"))


CTdatabase_single <- CTdatabase %>%
  mutate(Gene_Name = sub(pattern = "/.*$", Gene_Name, replacement = ""))
CTdatabase_single <- CTdatabase_single %>%
  mutate(Gene_Name = sub(pattern = ",.*$", Gene_Name, replacement = ""))


CTdatabase_official_names <- 
  unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, 
                       external_gene_name)) %>%
  filter(external_gene_name %in% CTdatabase_single$Gene_Name) %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA)
CTdatabase_synonym <- 
  ensembl_gene_synonym %>%
  filter(external_synonym %in% CTdatabase_single$Gene_Name) %>%
  mutate(Gene_Name = external_synonym) %>%
  dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym)
CTdatabase_cleaned <- 
  rbind(CTdatabase_official_names, CTdatabase_synonym) %>% 
  left_join(CTdatabase_single)


duplicated_genes <- CTdatabase_cleaned$Gene_Name[duplicated(CTdatabase_cleaned$Gene_Name)]
bad_ids <- ensembl_gene_synonym %>%
  filter(external_gene_name %in% duplicated_genes | external_synonym %in% duplicated_genes) %>%
  filter(chromosome_name %in% grep(pattern = "H", x = chromosome_name, value = TRUE)) %>%
  pull(ensembl_gene_id)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
  dplyr::filter(!ensembl_gene_id %in% bad_ids)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
  filter(!ensembl_gene_id == "ENSG00000052126")
CTdatabase_cleaned <- CTdatabase_cleaned %>% 
  filter(!(ensembl_gene_id == "ENSG00000183305" & Gene_Name == "MAGEA2"))
CTdatabase_cleaned <- CTdatabase_cleaned %>% 
  filter(!ensembl_gene_id == "ENSG00000204648")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CSAG3B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CSAG2", "external_synonym"] <- "CSAG3B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT45A4")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CT45A3", "external_synonym"] <- "CT45A4"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "LAGE-1b")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAG2", "external_synonym"] <- "LAGE-1b"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT16.2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "PAGE5", "external_synonym"] <- "CT16.2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXB2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXB1", "external_synonym"] <- "SPANXB2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXE")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXD", "external_synonym"] <- "SPANXE"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1C")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1D")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1E")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE2B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "XAGE2", "external_synonym"] <- "XAGE2B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CTAGE-2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAGE1", "external_synonym"] <- "CTAGE-2"


CTdatabase_cleaned <- ensembl_gene_synonym %>%
  mutate(Gene_Name = external_synonym) %>%
  filter(external_synonym == "CXorf61") %>%
  dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym) %>%
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "Cxorf61", 
                    c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name)) %>%
  filter(external_gene_name == "CCNA1") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "cyclin A1", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "GOLGA6L2") %>%
  filter(ensembl_gene_id == "ENSG00000174450") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "GOLGAGL2 FA", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "LYPD6B") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC130576", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "CT62") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC196993", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "CT75") %>%
  filter(ensembl_gene_id == "ENSG00000291155") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC440934", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "LINC01192") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC647107", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "TSPY1") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC728137", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "SSX2B") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "SSX2b", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)

CTdatabase_cleaned <- CTdatabase_cleaned[!duplicated(CTdatabase_cleaned$external_gene_name), ]


length(CTdatabase_cleaned$ensembl_gene_id[!CTdatabase_cleaned$ensembl_gene_id%in%all_genes$ensembl_gene_id])
## [1] 9

9 genes are not found in all_genes even when using gene names and ensembl_gene_id

2.1.2 Jamin’s list

Excel file coming from supplemental data.

Jamin_core_CT <- read_excel("data/Jamin_core_CT.xlsx")
Jamin_core_CT[Jamin_core_CT$Gene == "KIAA1211", "Gene"] <- "CRACD"
Jamin_core_CT[Jamin_core_CT$Gene == "CXorf67", "Gene"] <- "EZHIP"
Jamin_core_CT[Jamin_core_CT$Gene == "KIAA1109", "Gene"] <- "BLTP1"
Jamin_core_CT[Jamin_core_CT$Gene == "C10orf82", "Gene"] <- "SPMIP5"
Jamin_core_CT[Jamin_core_CT$Gene == "TTC30B", "Gene"] <- "IFT70B"
Jamin_core_CT[Jamin_core_CT$Gene == "TTC30A", "Gene"] <- "IFT70A"

All gene names have been linked to the right name in all_genes

2.1.3 Wang’s CTatlas

Excel file coming from supplemental data.

Wang_CT <- read_excel("data/Wang_Suppl_Data_3.xlsx", 
    sheet = "Supplementary Data 3B", skip = 1)
colnames(Wang_CT)[1] <- "ensembl_gene_id"

Wang_CT <- ensembl_gene_names %>% 
  filter(ensembl_gene_id %in% Wang_CT$ensembl_gene_id) %>%
  right_join(Wang_CT)

Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000181013", "external_gene_name"] <- "C17orf47"
Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000204293", "external_gene_name"] <- "OR8B2"


length(Wang_CT$external_gene_name[!Wang_CT$external_gene_name %in% all_genes$external_gene_name])
## [1] 12

12 genes are not found in all_genes, by gene names or ensemble gene id and no other name.

2.1.4 Carter’s list

Carter_CT_list <- read_table("data/Carter_CT_list.txt", skip = 1)
Carter_CT <- Carter_CT_list[Carter_CT_list$CT_Expression,]

Carter_CT[Carter_CT$Gene == "ENSG00000261649", "Gene_Name"] <- "GOLGA6L7"
Carter_CT[Carter_CT$Gene == "ENSG00000239620", "Gene_Name"] <- "PRR20G"
Carter_CT[Carter_CT$Gene == "ENSG00000168148", "Gene_Name"] <- "H3-4"
Carter_CT[Carter_CT$Gene == "ENSG00000204296", "Gene_Name"] <- "TSBP1"
Carter_CT[Carter_CT$Gene == "ENSG00000180219", "Gene_Name"] <- "GARIN6"
Carter_CT[Carter_CT$Gene == "ENSG00000172717", "Gene_Name"] <- "GARIN2"
Carter_CT[Carter_CT$Gene == "ENSG00000174015", "Gene_Name"] <- "CBY2"
Carter_CT[Carter_CT$Gene == "ENSG00000224960", "Gene_Name"] <- "PPP4R3C"
Carter_CT[Carter_CT$Gene == "ENSG00000221843", "Gene_Name"] <- "SPATA31H1"
Carter_CT[Carter_CT$Gene == "ENSG00000177947", "Gene_Name"] <- "CIMAP1A"
Carter_CT[Carter_CT$Gene == "ENSG00000172073", "Gene_Name"] <- "SPMIP9"
Carter_CT[Carter_CT$Gene == "ENSG00000229894", "Gene_Name"] <- "GK3"
Carter_CT[Carter_CT$Gene == "ENSG00000249693", "Gene_Name"] <- "SPMAP2L"
Carter_CT[Carter_CT$Gene == "ENSG00000173728", "Gene_Name"] <- "SPMIP3"

length(Carter_CT$Gene[!Carter_CT$Gene %in% all_genes$ensembl_gene_id])
## [1] 1

1 gene cannot be found in all_genes even by ensemble or gene name

2.1.5 Bruggeman’s list

Excel file from supplemental data.

Bruggeman_data <- read_excel("data/Bruggeman_suppl_data.xlsx", skip = 1,
                           sheet = "1D")

Bruggeman_official_names <- gene_synonym %>% 
  dplyr::select(external_gene_name) %>% 
  unique() %>% 
  filter(external_gene_name %in% Bruggeman_data$Gene) %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA)

Bruggeman_synonym <- gene_synonym %>%
  filter(external_synonym %in% Bruggeman_data$Gene) %>%
  mutate(Gene_Name = external_synonym) %>%
  dplyr::select(external_gene_name, Gene_Name, external_synonym)

Bruggeman_synonym <- Bruggeman_synonym[-which(Bruggeman_synonym$Gene_Name %in% 
                          Bruggeman_official_names$Gene_Name),]

Bruggeman_CT <- rbind(Bruggeman_official_names, Bruggeman_synonym)

lost <- Bruggeman_data[which(!Bruggeman_data$Gene %in% 
                               c(Bruggeman_CT$Gene_Name)), "Gene"]
colnames(lost) <- "external_gene_name"
lost$Gene_Name <- rep(NA, nrow(lost))
lost$external_synonym <- rep(NA, nrow(lost))

lost[lost$external_gene_name == "C21orf59", "Gene_Name"] <- "CFAP298"
lost[lost$external_gene_name == "C11orf57", "Gene_Name"] <- "NKAPD1"
lost[lost$external_gene_name == "C7orf55", "Gene_Name"] <- "FMC1"
lost[lost$external_gene_name == "C10orf12", "Gene_Name"] <- "LCOR"
lost[lost$external_gene_name == "RPL19P12", "Gene_Name"] <- "RPL19P12"
lost[lost$external_gene_name == "C16orf59", "Gene_Name"] <- "TEDC2"
lost[lost$external_gene_name == "TTTY15", "Gene_Name"] <- "USP9Y"
lost[lost$external_gene_name == "C17orf53", "Gene_Name"] <- "HROB"
lost[lost$external_gene_name == "C1orf112", "Gene_Name"] <- "FIRRM"
lost[lost$external_gene_name == "C12orf66", "Gene_Name"] <- "KICS2"
lost[lost$external_gene_name == "C9orf84", "Gene_Name"] <- "SHOC1"
lost[lost$external_gene_name == "C10orf25", "Gene_Name"] <- "ZNF22-AS1"
lost[lost$external_gene_name == "C20orf197", "Gene_Name"] <- "LINC02910"
lost[lost$external_gene_name == "C3orf67", "Gene_Name"] <- "CFAP20DC"
lost[lost$external_gene_name == "C8orf37", "Gene_Name"] <- "CFAP418"
lost[lost$external_gene_name == "C22orf34", "Gene_Name"] <- "MIR3667HG"
lost[lost$external_gene_name == "C9orf131", "Gene_Name"] <- "SPATA31G1"
lost[lost$external_gene_name == "C17orf80", "Gene_Name"] <- "MTNAP1"

Bruggeman_CT <- rbind(Bruggeman_CT, lost) 

missing_Bruggeman <- Bruggeman_CT[!Bruggeman_CT$Gene_Name %in% 
                                    all_genes$external_gene_name, ]$Gene_Name

external_names_to_keep <- gene_synonym %>%
   filter(external_synonym %in% missing_Bruggeman) %>% # take those in Bruggeman that are synonym
   filter(external_gene_name %in% all_genes$external_gene_name) %>% # but that are actually in all_genes with another name
   mutate(Gene_Name = external_gene_name)
 
Bruggeman_CT[Bruggeman_CT$external_synonym %in% 
                    external_names_to_keep$external_synonym, 
                  "Gene_Name"] <- external_names_to_keep$Gene_Name

Bruggeman_CT <- Bruggeman_CT %>% 
  dplyr::select(Gene_Name)


length(Bruggeman_CT$Gene_Name[!Bruggeman_CT$Gene_Name %in% 
                                all_genes$external_gene_name])
## [1] 39

39 genes are not found with their names in all genes

2.2 CTexploreR data for selection pipeline

To characterise the differences between our database and other, we need the category we created in CTexploreR. For this, we have the object all_genes in CTdata that contains the CT analysis for all genes. More info in

Hereunder is what we used for our selection pipeline (coming from make_all_genes_prelim.R and 130_make_all_genes_and_CT_genes.R in CTdata).

all_genes

From there, we filtered based on the testis_specificity (“testis_specific”, which is based on expression in health tissue and scRNA seq info from HPA), CCLE_category (“activated”) and TCGA_category (“activated” or “multimapping_issue”) to have our CT genes. Then, when wanting to validate TSS manually, we realised that for some genes, reads were not properly aligned to exons which might reflect a poorly defined transcription in these regions and are hence likely unreliable.

Some genes were also characterized as Cancer-Testis preferential genes when testis specificity was less stringent

2.3 CTexploreR VS CTdatabase

CTdatabase_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
                             CTdatabase = CTdatabase_cleaned$external_gene_name))
gp <- VennThemes(compute.Venn(CTdatabase_ours))
gp[["Face"]][["11"]]$fill <-  "mediumaquamarine"
gp[["Face"]][["10"]]$fill <-  "darkseagreen1"
gp[["Face"]][["01"]]$fill <-  "lightsteelblue1"
gp[["Set"]][["Set2"]]$col <-  "paleturquoise4"
gp[["Set"]][["Set1"]]$col <-  "darkseagreen4"
gp[["SetText"]][["Set2"]]$col <-  "paleturquoise4"
gp[["SetText"]][["Set1"]]$col <-  "darkseagreen4"
gp[["FaceText"]][["11"]]$fontsize <- 30
gp[["FaceText"]][["10"]]$fontsize <- 30
gp[["FaceText"]][["01"]]$fontsize <- 30
gp[["SetText"]][["Set2"]]$fontsize <-  30
gp[["SetText"]][["Set1"]]$fontsize <- 30
plot(CTdatabase_ours, gp = gp)

We find 25.6097561 % of CTdatabase in CTexploreR, which is 43.1506849% of our database.

Lost genes analysis

CTdatabase_lost <- all_genes %>%
  filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["01"]])

# 9 Genes are lost because not in any database like before

table(CTdatabase_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                 101                  29                  44
table(CTdatabase_lost$CT_gene_type)
## 
## CTP_gene    other 
##       23      151
table(CTdatabase_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##    51    85
table(CTdatabase_lost$TCGA_category)
## 
##          activated              leaky    lowly_activated multimapping_issue 
##                 78                 29                  2                 28 
##      not_activated 
##                 37
table(CTdatabase_lost$CCLE_category)
## 
##       activated           leaky lowly_activated   not_activated 
##              89              26               4              55
table(CTdatabase_lost$TCGA_category, CTdatabase_lost$CCLE_category)
##                     
##                      activated leaky lowly_activated not_activated
##   activated                 60     1               3            14
##   leaky                      4    25               0             0
##   lowly_activated            1     0               1             0
##   multimapping_issue        14     0               0            14
##   not_activated             10     0               0            27
CTdatabase_lost_upset <- 
  list(`Not testis specific` = 
         filter(CTdatabase_lost,
                testis_specificity != "testis_specific" & 
                  CT_gene_type == "other")$external_gene_name,
       `Not tumour activated` = 
         filter(CTdatabase_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(CTdatabase_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

CTdatabase_lost$external_gene_name[!CTdatabase_lost$external_gene_name %in% unlist(CTdatabase_lost_upset)]
## character(0)
upset_CTdatabase <- fromList(CTdatabase_lost_upset)

upset(upset_CTdatabase,
      text.scale = upset_text_size, 
      mb.ratio = c(0.6, 0.4))

74.7126437 % of these genes are not testis specific.

However 23 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

65.5172414 % are not properly activated in tumors and/or cancer cell lines.

There is one gene (SPANXN1) that is missing (in 165 from CTdatabase lost) but that is not on the upset plot (164 on upset plot) as it was removed because of the TSS.

In their analysis, they had characterised gene specificity, some being not available, not found in testis, testis-restricted, testis-selective and testis/brain-restricted. Let’s see how the lost genes qualify as they didn’t mention those were strictly testis specific.

CTdatabase_cleaned %>% 
  filter(external_gene_name %in% CTdatabase_lost$external_gene_name) %>% 
  pull(Classification) %>% 
  table()
## .
##           not available     not found in testis       testis-restricted 
##                      95                       2                      16 
##        testis-selective testis/brain-restricted 
##                      54                       7

We can see that most of them had no info or were testis-selective (I couldn’t find on website or paper how they selected categories).

all_genes %>%
  filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["10"]]) %>% 
  pull(regulated_by_methylation) %>% 
  table()
## .
## FALSE  TRUE 
##    31    51
all_genes %>%
  filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["10"]]) %>% 
  pull(transcript_biotype) %>% 
  table()
## .
##         lncRNA protein_coding 
##             27             56
table(CT_genes$transcript_biotype)
## 
##         lncRNA protein_coding 
##             29            117
#45 lncRNA in all CT_genes

all_genes %>%
  filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["10"]]) %>% 
  pull(CpG_promoter) %>% 
  table()
## .
##         high intermediate          low 
##           30           28           25
table(CT_genes$CpG_promoter)
## 
##         high intermediate          low 
##           44           69           33
all_genes %>%
  filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["10"]])

2.4 CTexploreR VS omics databases

core_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
                       Jamin = Jamin_core_CT$Gene))

Wang_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
                       Wang = Wang_CT$external_gene_name))

Carter_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
                         Carter_CT = Carter_CT$Gene_Name))

Bruggeman_ours <- Venn(list(CTexploreR = CT_genes$external_gene_name,
                            Bruggeman = Bruggeman_CT$Gene_Name))

gene_list <- list(CTexploreR = CT_genes$external_gene_name,
                  Carter = Carter_CT$Gene_Name,
                  Jamin = Jamin_core_CT$Gene, 
                  CTatlas = Wang_CT$external_gene_name,
                  Bruggeman = Bruggeman_CT$Gene_Name)

upset_omics <- fromList(gene_list)
upset(upset_omics)

4 in all, 60 in at least 3 databases

Lost genes analysis

plot(core_ours, gp = gp)

Jamin_lost <- all_genes %>%
  filter(external_gene_name %in% core_ours@IntersectionSets[["01"]])

table(Jamin_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                  81                  13                   5
table(Jamin_lost$CT_gene_type)
## 
## CTP_gene    other 
##       10       89
table(Jamin_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##    45    36
table(Jamin_lost$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                 34                 42                 19                  4
table(Jamin_lost$CCLE_category)
## 
##       activated           leaky lowly_activated   not_activated 
##              55              35               1               8
table(Jamin_lost$TCGA_category, Jamin_lost$CCLE_category)
##                     
##                      activated leaky lowly_activated not_activated
##   activated                 28     2               1             3
##   leaky                      9    33               0             0
##   multimapping_issue        16     0               0             3
##   not_activated              2     0               0             2
Jamin_lost_upset <- 
  list(`Not testis specific` = 
         filter(Jamin_lost,
                testis_specificity != "testis_specific" &
                  CT_gene_type == "other")$external_gene_name,
       `Not tumour activated` = 
         filter(Jamin_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(Jamin_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_Jamin <- fromList(Jamin_lost_upset)
upset(upset_Jamin,
      text.scale = upset_text_size)

We find 20.8 % of CTdatabase in CTexploreR, which is 17.8082192 % of our database.

94.9494949% of these genes are not testis specific.

However 10 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

71.7171717 % are not properly activated in tumors and/or cancer cell lines.

plot(Wang_ours, gp = gp)

Wang_lost <- all_genes %>%
  filter(external_gene_name %in% Wang_ours@IntersectionSets[["01"]])

# 12 genes lost because no info like before

table(Wang_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                 574                 141                 216
table(Wang_lost$CT_gene_type)
## 
## CTP_gene    other 
##       79      852
table(Wang_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##   254   601
table(Wang_lost$TCGA_category)
## 
##          activated              leaky    lowly_activated multimapping_issue 
##                464                163                  9                 69 
##      not_activated 
##                226
table(Wang_lost$CCLE_category)
## 
##       activated           leaky lowly_activated   not_activated 
##             405             121              40             365
table(Wang_lost$TCGA_category, Wang_lost$CCLE_category)
##                     
##                      activated leaky lowly_activated not_activated
##   activated                308     9              28           119
##   leaky                     47   112               2             2
##   lowly_activated            2     0               3             4
##   multimapping_issue        20     0               1            48
##   not_activated             28     0               6           192
Wang_lost_upset <- 
  list(`Not testis specific` = 
         filter(Wang_lost,
                testis_specificity != "testis_specific" &
                  CT_gene_type == "other")$external_gene_name,
       `Not tumour activated` = 
         filter(Wang_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(Wang_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_Wang <- fromList(Wang_lost_upset)
upset(upset_Wang,
      text.scale = upset_text_size)

We find 7.4582924 % of CTdatabase in CTexploreR, which is 52.0547945 % of our database.

76.7991407% of these genes are not testis specific.

However 79 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

66.9172932 % are not properly activated in tumors and/or cancer cell lines.

plot(Carter_ours, gp = gp)

Carter_lost <- all_genes %>%
  filter(external_gene_name %in% Carter_ours@IntersectionSets[["01"]])

# 1 lost because no info, like before

table(Carter_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                  22                   9                  32
table(Carter_lost$CT_gene_type)
## 
## CTP_gene    other 
##        4       59
table(Carter_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##    13    43
table(Carter_lost$TCGA_category)
## 
##       activated           leaky lowly_activated   not_activated 
##              44               1               2              16
table(Carter_lost$CCLE_category)
## 
##       activated lowly_activated   not_activated 
##              16               9              38
table(Carter_lost$TCGA_category, Carter_lost$CCLE_category)
##                  
##                   activated lowly_activated not_activated
##   activated              14               8            22
##   leaky                   0               0             1
##   lowly_activated         1               0             1
##   not_activated           1               1            14
Carter_lost_upset <- 
  list(`Not testis specific` = 
         filter(Carter_lost,
                testis_specificity != "testis_specific" &
                  CT_gene_type == "other")$external_gene_name,
       `Not tumour activated` = 
         filter(Carter_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(Carter_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_Carter <- fromList(Carter_lost_upset)
upset(upset_Carter,
      text.scale = upset_text_size)

We find 37.8640777 % of CTdatabase in CTexploreR, which is 26.7123288 % of our database.

49.2063492% of these genes are not testis specific.

However 4 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

77.7777778 % are not properly activated in tumors and/or cancer cell lines.

plot(Bruggeman_ours, gp = gp)

Bruggeman_lost <- all_genes %>%
  filter(external_gene_name %in% Bruggeman_ours@IntersectionSets[["01"]])

# 38 lost, like before

table(Bruggeman_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                 669                  26                  10
table(Bruggeman_lost$CT_gene_type)
## 
## CTP_gene    other 
##       20      685
table(Bruggeman_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##   441   169
table(Bruggeman_lost$TCGA_category)
## 
##          activated              leaky    lowly_activated multimapping_issue 
##                290                381                  6                 11 
##      not_activated 
##                 17
table(Bruggeman_lost$CCLE_category)
## 
##       activated           leaky lowly_activated   not_activated 
##             316             360              11              18
table(Bruggeman_lost$TCGA_category, Bruggeman_lost$CCLE_category)
##                     
##                      activated leaky lowly_activated not_activated
##   activated                239    31               7            13
##   leaky                     56   323               2             0
##   lowly_activated            1     4               0             1
##   multimapping_issue         9     0               1             1
##   not_activated             11     2               1             3
Bruggeman_lost_upset <- 
  list(`Not testis specific` = 
         filter(Bruggeman_lost,
                testis_specificity != "testis_specific" &
                  CT_gene_type == "other")$external_gene_name,
       `Not tumour activated` = 
         filter(Bruggeman_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(Bruggeman_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_Bruggeman <- fromList(Bruggeman_lost_upset)
upset(upset_Bruggeman,
      text.scale = upset_text_size)

We find 1.5873016 % of CTdatabase in CTexploreR, which is 8.2191781 % of our database.

98.5815603% of these genes are not testis specific.

However 20 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

66.0992908 % are not properly activated in tumors and/or cancer cell lines.

2.5 Characterisation of differences with all databases

all_database_gene_list <- list(CTexploreR = CT_genes$external_gene_name,
                               Carter = Carter_CT$Gene_Name,
                               Jamin = Jamin_core_CT$Gene,
                               CTatlas = Wang_CT$external_gene_name,
                               Bruggeman = Bruggeman_CT$Gene_Name,
                               CTdatabase = CTdatabase_cleaned$external_gene_name)

upset_all_database <- fromList(all_database_gene_list)

upset_text_size_all <- c(2, 2, 2, 2, 3, 2)
 #c(intersection size title, intersection size tick labels, set size title, 
 # set size tick labels, set names, numbers above bars)
upset(upset_all_database,
      nsets = 6,
      nintersects = 50,
      text.scale = upset_text_size_all,
      mb.ratio = c(0.6, 0.4))

Venn_all_database <- Venn(all_database_gene_list)
Venn_all_database@IntersectionSets[["111111"]]
## [1] "MAGEB2" "MAGEA4" "MAGEA3"
common <- unique(c(core_ours@IntersectionSets[["11"]], 
                   CTdatabase_ours@IntersectionSets[["11"]], 
                   Wang_ours@IntersectionSets[["11"]], 
                   Carter_ours@IntersectionSets[["11"]],
                   Bruggeman_ours@IntersectionSets[["11"]]))

length(common)
## [1] 98
length(common)/dim(CT_genes)[1] * 100
## [1] 67.12329
lost_list <- unique(c(core_ours@IntersectionSets[["01"]],
                      CTdatabase_ours@IntersectionSets[["01"]],
                      Wang_ours@IntersectionSets[["01"]],
                      Carter_ours@IntersectionSets[["01"]],
                      Bruggeman_ours@IntersectionSets[["01"]]))

lost <- all_genes %>%
  filter(external_gene_name %in% lost_list)

all_lost_upset <- 
  list(`Not testis specific` = 
         filter(lost,
                testis_specificity != "testis_specific" &
                  CT_gene_type == "other")$external_gene_name,
       `Not tumour activated` = 
         filter(lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_all <- fromList(all_lost_upset)
upset(upset_all,
      text.scale = upset_text_size)

not_specific <- filter(lost, testis_specificity == "not_testis_specific")

GTEX_expression(not_specific$external_gene_name, units = "log_TPM")
## see ?CTdata and browseVignettes('CTdata') for documentation
## loading from cache

somatic_testis <- filter(lost, not_detected_in_somatic_HPA == FALSE)

testis_expression(somatic_testis$external_gene_name, cells = "all")
## see ?CTdata and browseVignettes('CTdata') for documentation
## loading from cache
## Warning: 11 out of 711 names invalid: MPL, EDAR, ASB14, F2RL2, PCDHGB2, FOXO3B,
## KRT33B, SEMG1, ADORA2A, DAZ2, DAZ4.
## See the manual page for valid types.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

HPA_cell_type_expression(somatic_testis$external_gene_name)
## see ?CTdata and browseVignettes('CTdata') for documentation
## loading from cache

not_TCGA_activated <- filter(lost, TCGA_category != "activated" & 
                               TCGA_category != "multimapping_issue")

TCGA_expression(not_TCGA_activated$external_gene_name,
                tumor = "all",
                units = "log_TPM")
## see ?CTdata and browseVignettes('CTdata') for documentation
## loading from cache
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

not_CCLE_activated <- filter(lost, CCLE_category  != "activated")

CCLE_expression(not_CCLE_activated$external_gene_name,
                  type = c("lung", "skin", "colorectal",
                           "gastric", "breast", "head_and_neck"),
                units = "log_TPM")
## see ?CTdata and browseVignettes('CTdata') for documentation
## loading from cache

transcript_prob <- lost %>% 
  pull(IGV_backbone) %>% 
  table()

98 genes in our CTexploreR database are found in at least one of the other database, which represents 67.1232877%.

The 3 genes found in all databases are MAGEB2, MAGEA4, MAGEA3.

We have lost 1713 genes in total. Among them, 72.9130181% are not considered testis specific, 41.5061296% are expressed in somatic cells, 48.1611208% are not activated in TCGA samples, 54.5826036% are not activated in CCLE cell lines and 0.5837712% is lost due to transcripts problems.

What about new genes in CTexploreR

new <- CT_genes %>% 
  filter(external_gene_name %in% Venn_all_database@IntersectionSets[["100000"]])

new
table(new$testis_specificity)
## 
## testis_specific 
##              48
table(new$X_linked)
## 
## FALSE  TRUE 
##    36    12
table(new$regulated_by_methylation)
## 
## FALSE  TRUE 
##    16    31
table(new$X_linked, new$regulated_by_methylation)
##        
##         FALSE TRUE
##   FALSE    16   19
##   TRUE      0   12
table(new$CpG_promoter)
## 
##         high intermediate          low 
##           15           19           14
TCGA_expression(tumor = "all", genes = new$external_gene_name, 
                units = "log_TPM")
## see ?CTdata and browseVignettes('CTdata') for documentation
## loading from cache
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

TCGA_expression(tumor = "all", 
                genes = filter(new, X_linked & regulated_by_methylation)$external_gene_name, 
                units = "log_TPM")
## see ?CTdata and browseVignettes('CTdata') for documentation
## loading from cache
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

testis_expression(genes = new$external_gene_name, "germ_cells")
## see ?CTdata and browseVignettes('CTdata') for documentation
## loading from cache
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

There are 48 new CT genes in CTexploreR. These are all testis specific and mainly on autosomes. Regulation by methylation is the majority of them. There is only 11 new “major” CT that are on the X chromosome and regulated by methylation. CT45 are not that new.

Expression in tumours doesn’t strike that much.

3 SessionInfo

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] SingleCellExperiment_1.28.1 UpSetR_1.4.0               
##  [3] SummarizedExperiment_1.36.0 Biobase_2.66.0             
##  [5] GenomicRanges_1.58.0        GenomeInfoDb_1.42.3        
##  [7] IRanges_2.40.1              S4Vectors_0.44.0           
##  [9] BiocGenerics_0.52.0         MatrixGenerics_1.18.1      
## [11] matrixStats_1.5.0           lubridate_1.9.4            
## [13] forcats_1.0.0               stringr_1.5.1              
## [15] dplyr_1.1.4                 purrr_1.0.4                
## [17] tidyr_1.3.1                 tibble_3.2.1               
## [19] ggplot2_3.5.1               tidyverse_2.0.0            
## [21] biomaRt_2.62.1              Vennerable_3.1.0.9000      
## [23] CTexploreR_1.2.0            CTdata_1.6.0               
## [25] readr_2.1.5                 readxl_1.4.5               
## 
## loaded via a namespace (and not attached):
##   [1] DBI_1.2.3               RBGL_1.82.0             gridExtra_2.3          
##   [4] httr2_1.1.1             rlang_1.1.5             magrittr_2.0.3         
##   [7] clue_0.3-66             GetoptLong_1.0.5        compiler_4.4.1         
##  [10] RSQLite_2.3.9           reshape2_1.4.4          png_0.1-8              
##  [13] vctrs_0.6.5             pkgconfig_2.0.3         shape_1.4.6.1          
##  [16] crayon_1.5.3            fastmap_1.2.0           magick_2.8.6           
##  [19] dbplyr_2.5.0            XVector_0.46.0          labeling_0.4.3         
##  [22] rmarkdown_2.29          tzdb_0.5.0              graph_1.84.1           
##  [25] UCSC.utils_1.2.0        bit_4.6.0               xfun_0.51              
##  [28] zlibbioc_1.52.0         cachem_1.1.0            jsonlite_1.9.1         
##  [31] progress_1.2.3          blob_1.2.4              DelayedArray_0.32.0    
##  [34] prettyunits_1.2.0       parallel_4.4.1          cluster_2.1.8.1        
##  [37] R6_2.6.1                stringi_1.8.4           bslib_0.9.0            
##  [40] RColorBrewer_1.1-3      jquerylib_0.1.4         cellranger_1.1.0       
##  [43] Rcpp_1.0.14             iterators_1.0.14        knitr_1.50             
##  [46] timechange_0.3.0        Matrix_1.7-3            tidyselect_1.2.1       
##  [49] rstudioapi_0.17.1       abind_1.4-8             yaml_2.3.10            
##  [52] doParallel_1.0.17       codetools_0.2-20        curl_6.2.2             
##  [55] plyr_1.8.9              lattice_0.22-6          withr_3.0.2            
##  [58] KEGGREST_1.46.0         evaluate_1.0.3          BiocFileCache_2.14.0   
##  [61] xml2_1.3.8              circlize_0.4.16         ExperimentHub_2.14.0   
##  [64] Biostrings_2.74.1       pillar_1.10.1           BiocManager_1.30.25    
##  [67] filelock_1.0.3          foreach_1.5.2           generics_0.1.3         
##  [70] vroom_1.6.5             BiocVersion_3.20.0      hms_1.1.3              
##  [73] munsell_0.5.1           scales_1.3.0            glue_1.8.0             
##  [76] tools_4.4.1             AnnotationHub_3.14.0    Cairo_1.6-2            
##  [79] grid_4.4.1              AnnotationDbi_1.68.0    colorspace_2.1-1       
##  [82] GenomeInfoDbData_1.2.13 cli_3.6.4               rappdirs_0.3.3         
##  [85] S4Arrays_1.6.0          ComplexHeatmap_2.22.0   gtable_0.3.6           
##  [88] sass_0.4.9              digest_0.6.37           SparseArray_1.6.2      
##  [91] ggrepel_0.9.6           farver_2.1.2            rjson_0.2.23           
##  [94] memoise_2.0.1           htmltools_0.5.8.1       lifecycle_1.0.4        
##  [97] httr_1.4.7              mime_0.13               GlobalOptions_0.1.2    
## [100] bit64_4.6.0-1